Analysis based primarily on the Sun and Vera et. al. 2020 paper with the code from https://github.com/BROOKELAB/SingleCell/tree/fa170e4c16f64670bada6ef56ae9cf1ffafb4786 and the Seurat database vignettes https://github.com/satijalab/seurat/blob/master/vignettes/pbmc3k_tutorial.Rmd
Preliminary analysis with regressing out the cell cycle markers that skew the clustering

# load required libraries
library(Seurat)
library(ggplot2)
library(sctransform)
library(dplyr)

Read in the .h5 file using Seurat function. Load in the count matrix as a Seurat object.

# read in Cal07 filtered data
cal07_a549_counts <- Read10X_h5(
        "/Users/ethayer/Google Drive/Grad School/Brooke Lab/Data/polyIC_scRNAseq/03_scRNAseq_h5/Cal07_aggr/filtered_feature_bc_matrix.h5",
        use.names = TRUE, unique.features = TRUE)
# load the data into Seurat for analysis
a549 <- CreateSeuratObject(counts = cal07_a549_counts, min.cells = 4, 
                           min.features = 400, project = "cal07_mock")

Organize the data into two levels based off of the barcode ids (in this case Cal07(1) or Mock(2)).

# split the rownames by -, then select only the second element 
# (the number 1 (cal07) or 2 (mock))
experiment_id <- factor(sapply(strsplit(rownames(a549@meta.data), "-"), 
                               function(x) x[2]))
# set the factor levels
levels(experiment_id) <- c("Cal07", "Mock")

# Add it back to the seurat object meta-data
a549@meta.data$experiment_id <- experiment_id

Look at QC metrics of the data.

# Show QC metrics for the first/last 5 cells
head(a549@meta.data, 5)
##                    orig.ident nCount_RNA nFeature_RNA experiment_id
## AAACCCAAGGTAGACC-1 cal07_mock      43266         5875         Cal07
## AAACCCAAGGTAGCAC-1 cal07_mock       3008          538         Cal07
## AAACCCACACGACTAT-1 cal07_mock      46286         5888         Cal07
## AAACCCAGTAGCTCGC-1 cal07_mock      40966         5854         Cal07
## AAACGAAAGTCTAACC-1 cal07_mock      35460         5214         Cal07
summary(a549@meta.data$experiment_id)
## Cal07  Mock 
##  5019  7243
# Visualize QC metrics as a violin plot
VlnPlot(a549, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)

# Look at the relationship between counts and features
FeatureScatter(a549, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
               group.by =  "experiment_id")

Filtering based on QC. Low feature could mean this is a droplet. High count/feature indicates a doublet.

# Subset to a new object based off of the QC data 
a549_01 <- subset(a549, 
                  subset = nFeature_RNA > 500 & nFeature_RNA < 8500 & nCount_RNA < 115000)

# Do all the QC stuff again with the new object
head(a549_01@meta.data, 5)
##                    orig.ident nCount_RNA nFeature_RNA experiment_id
## AAACCCAAGGTAGACC-1 cal07_mock      43266         5875         Cal07
## AAACCCAAGGTAGCAC-1 cal07_mock       3008          538         Cal07
## AAACCCACACGACTAT-1 cal07_mock      46286         5888         Cal07
## AAACCCAGTAGCTCGC-1 cal07_mock      40966         5854         Cal07
## AAACGAAAGTCTAACC-1 cal07_mock      35460         5214         Cal07
# Visualize QC metrics as a violin plot
VlnPlot(a549_01, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)

# Look at the relationship between counts and features
FeatureScatter(a549_01, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", 
               group.by =  "experiment_id")

Loading in the cell cycle markers from Tirosh et. al., 2015 with Seurat. We can then assign cell cycle scores.

# Loading in the markers
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes

# Assign cell cycle scores based off of the imported cycle markers above
a549_01 <- CellCycleScoring(
        a549_01,
        s.features = s.genes,
        g2m.features = g2m.genes,
        set.ident = TRUE)

# view cell cycle scores and phase assignments
head(a549_01[[]])
##                    orig.ident nCount_RNA nFeature_RNA experiment_id
## AAACCCAAGGTAGACC-1 cal07_mock      43266         5875         Cal07
## AAACCCAAGGTAGCAC-1 cal07_mock       3008          538         Cal07
## AAACCCACACGACTAT-1 cal07_mock      46286         5888         Cal07
## AAACCCAGTAGCTCGC-1 cal07_mock      40966         5854         Cal07
## AAACGAAAGTCTAACC-1 cal07_mock      35460         5214         Cal07
## AAACGAAGTGTAAATG-1 cal07_mock      33306         5056         Cal07
##                         S.Score  G2M.Score Phase  old.ident
## AAACCCAAGGTAGACC-1 -0.386699507 -6.9018137    G1 cal07_mock
## AAACCCAAGGTAGCAC-1 -0.007799672 -0.5129248    G1 cal07_mock
## AAACCCACACGACTAT-1 -1.211617406 -7.4227568    G1 cal07_mock
## AAACCCAGTAGCTCGC-1 -1.257594417 -6.7613211    G1 cal07_mock
## AAACGAAAGTCTAACC-1 -0.964901478 -6.0331424    G1 cal07_mock
## AAACGAAGTGTAAATG-1 -0.397783251 -5.4420962    G1 cal07_mock
# different scaling method which allows you to specify variables contributing to the scaling/normalization of the data
a549_01 <- SCTransform(
        a549_01, method = "glmGamPoi",
        vars.to.regress = c("S.Score", "G2M.Score"),
        verbose = FALSE)

Looking at which dimensions contribute the most to the percentage of variance

# PCA to find similar genes/cells for clustering
a549_01 <- RunPCA(a549_01, verbose = FALSE)
# Best explanation from https://github.com/satijalab/seurat/blob/master/vignettes/pbmc3k_tutorial.Rmd:
# "‘Elbow plot’: a ranking of principle components based on the percentage of variance explained by each one"
# I chose 15, as there is little drop off after that and it is best to be conservative when picking your dimensions to not leave anything out
ElbowPlot(a549_01)

Performing dimensional reduction and clustering

# making UMAP dimensional reduction
a549_01 <- RunUMAP(a549_01, dims = 1:15, verbose = FALSE)
a549_01 <- FindNeighbors(a549_01, dims = 1:15, verbose = FALSE)
a549_01 <- FindClusters(a549_01, verbose = FALSE, resolution = 0.5)
DimPlot(a549_01, label = TRUE) + NoLegend()

Want to make sure the cell cycle genes are regressed out

# loop to plot each UMAP with colored cell cycle genes one by one
for (x in g2m.genes[g2m.genes %in% VariableFeatures(a549)]) {
    g <- FeaturePlot(a549_01, x, order = TRUE, pt.size = 1)  
    print(g)
}

# UMAP color/clustering based on classification of cell cycle
# making sure the cell cycle genes are widely distributed
FeaturePlot(a549_01, "G2M.Score")

Identifying the most variable genes

# identify features that are outliers on a 'mean variability plot'
a549_01 <- FindVariableFeatures(a549_01, selection.method = "vst", nfeatures = 2000)

# Identify the 25 most highly variable genes
top25 <- head(VariableFeatures(a549_01), 15)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(a549_01)
plot2 <- LabelPoints(plot = plot1, points = top25, repel = TRUE,
                     max.overlaps = 20, xnudge = 0, ynudge = 0)
plot2

Where in the clustering are the viral genes being expressed

# where are the infected cells in the dimensional reduction
segment_gene_names <- c("NS", "NA.", "HA", "NP", "PB2", "PB1", "PA", "M")

# loop to plot each UMAP with colored flu genes one by one
for (x in segment_gene_names) {
    g <- FeaturePlot(a549_01, x, order = TRUE, pt.size = 1)  
    print(g)
}

Calculating the viral score

Still needed to do: + set cutoffs for each viral gene based off the first local minimum + see what % of the population is infected + look at IFN & ISG landscape

Organizing to just look at Cal07-exposed population (hopefully infected)

# subetting to select only cells from the Cal07 population
just_cal <- subset(x = a549_01, subset = experiment_id == "Cal07")
just_cal
## An object of class Seurat 
## 43039 features across 4963 samples within 2 assays 
## Active assay: SCT (21250 features, 2000 variable features)
##  1 other assay present: RNA
##  2 dimensional reductions calculated: pca, umap

Setting cutoffs and quickly trying to look at genes… ending here for now

# This will calculate the number of UMI that match flu segments in each cell
# starting with the subset that only has the cal07 population
flu_counts <- colSums(GetAssayData(just_cal, slot = "count")[segment_gene_names,])
all_counts <- colSums(GetAssayData(just_cal, slot = "count"))
# calculating the percentage of flu in population
just_cal$flu_pct_count <- flu_counts / all_counts
just_cal$log_flu_pct_count <- log10((just_cal$flu_pct_count * 100) + 1)

# This will calculate the number of UMI that match flu segments in each cell 
# this is the aggregated object
flu_counts2 <- colSums(GetAssayData(a549_01, slot = "count")[segment_gene_names,])
all_counts2 <- colSums(GetAssayData(a549_01, slot = "count"))

a549_01$flu_pct_count2 <- flu_counts2 / all_counts2
a549_01$log_flu_pct_count2 <- log10((a549_01$flu_pct_count2 * 100) + 1)

# getting a density line from the subset data
des.all <- density(just_cal$log_flu_pct_count)
# finding first local minima and setting as the cutoff
min.all <- des.all$x[which(diff(sign(diff(des.all$y)))==2)+1]

# density plot of aggregated data with the cutoff from subset
qplot(x = a549_01$log_flu_pct_count2, geom = "density") + scale_y_sqrt() +
        geom_vline(xintercept = min.all[1])

FeaturePlot(a549_01, "log_flu_pct_count2")

a549_01$is_infected <- a549_01$log_flu_pct_count2 > min.all[1]
FeaturePlot(a549_01, "is_infected")

FeaturePlot(a549_01, "SLFN5", split.by = "experiment_id", keep.scale = "all")

VlnPlot(a549_01, "SLFN5", split.by = "is_infected", group.by = "experiment_id")

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] dplyr_1.0.7        sctransform_0.3.2  ggplot2_3.3.5      SeuratObject_4.0.4
## [5] Seurat_4.0.6      
## 
## loaded via a namespace (and not attached):
##   [1] plyr_1.8.6                  igraph_1.2.10              
##   [3] lazyeval_0.2.2              splines_4.0.3              
##   [5] listenv_0.8.0               scattermore_0.7            
##   [7] GenomeInfoDb_1.26.7         digest_0.6.29              
##   [9] htmltools_0.5.2             fansi_1.0.2                
##  [11] magrittr_2.0.2              tensor_1.5                 
##  [13] cluster_2.1.0               ROCR_1.0-11                
##  [15] globals_0.14.0              matrixStats_0.61.0         
##  [17] spatstat.sparse_2.1-0       colorspace_2.0-2           
##  [19] ggrepel_0.9.1               xfun_0.26                  
##  [21] crayon_1.4.2                RCurl_1.98-1.5             
##  [23] jsonlite_1.7.3              spatstat.data_2.1-2        
##  [25] survival_3.2-7              zoo_1.8-9                  
##  [27] glue_1.6.1                  polyclip_1.10-0            
##  [29] gtable_0.3.0                zlibbioc_1.36.0            
##  [31] XVector_0.30.0              leiden_0.3.9               
##  [33] DelayedArray_0.16.3         future.apply_1.8.1         
##  [35] BiocGenerics_0.36.1         abind_1.4-5                
##  [37] scales_1.1.1                DBI_1.1.1                  
##  [39] miniUI_0.1.1.1              Rcpp_1.0.8                 
##  [41] viridisLite_0.4.0           xtable_1.8-4               
##  [43] reticulate_1.22             spatstat.core_2.3-2        
##  [45] bit_4.0.4                   stats4_4.0.3               
##  [47] htmlwidgets_1.5.4           httr_1.4.2                 
##  [49] RColorBrewer_1.1-2          ellipsis_0.3.2             
##  [51] ica_1.0-2                   pkgconfig_2.0.3            
##  [53] farver_2.1.0                sass_0.4.0                 
##  [55] uwot_0.1.11                 deldir_1.0-6               
##  [57] utf8_1.2.2                  tidyselect_1.1.1           
##  [59] labeling_0.4.2              rlang_1.0.0                
##  [61] reshape2_1.4.4              later_1.3.0                
##  [63] munsell_0.5.0               tools_4.0.3                
##  [65] cli_3.1.1                   generics_0.1.0             
##  [67] ggridges_0.5.3              evaluate_0.14              
##  [69] stringr_1.4.0               fastmap_1.1.0              
##  [71] yaml_2.2.1                  goftest_1.2-3              
##  [73] knitr_1.36                  bit64_4.0.5                
##  [75] fitdistrplus_1.1-6          purrr_0.3.4                
##  [77] RANN_2.6.1                  sparseMatrixStats_1.2.1    
##  [79] pbapply_1.5-0               future_1.23.0              
##  [81] nlme_3.1-149                mime_0.12                  
##  [83] hdf5r_1.3.5                 compiler_4.0.3             
##  [85] rstudioapi_0.13             plotly_4.10.0              
##  [87] png_0.1-7                   spatstat.utils_2.3-0       
##  [89] tibble_3.1.6                bslib_0.3.1                
##  [91] glmGamPoi_1.2.0             stringi_1.7.6              
##  [93] highr_0.9                   RSpectra_0.16-0            
##  [95] lattice_0.20-41             Matrix_1.4-0               
##  [97] vctrs_0.3.8                 pillar_1.7.0               
##  [99] lifecycle_1.0.1             spatstat.geom_2.3-1        
## [101] lmtest_0.9-39               jquerylib_0.1.4            
## [103] RcppAnnoy_0.0.19            data.table_1.14.2          
## [105] cowplot_1.1.1               bitops_1.0-7               
## [107] irlba_2.3.5                 httpuv_1.6.3               
## [109] patchwork_1.1.1             GenomicRanges_1.42.0       
## [111] R6_2.5.1                    promises_1.2.0.1           
## [113] KernSmooth_2.23-17          gridExtra_2.3              
## [115] IRanges_2.24.1              parallelly_1.30.0          
## [117] codetools_0.2-16            MASS_7.3-53                
## [119] assertthat_0.2.1            SummarizedExperiment_1.20.0
## [121] withr_2.4.3                 S4Vectors_0.28.1           
## [123] GenomeInfoDbData_1.2.4      mgcv_1.8-33                
## [125] parallel_4.0.3              grid_4.0.3                 
## [127] rpart_4.1-15                tidyr_1.1.4                
## [129] DelayedMatrixStats_1.12.3   rmarkdown_2.11             
## [131] MatrixGenerics_1.2.1        Rtsne_0.15                 
## [133] Biobase_2.50.0              shiny_1.7.1